counts = read.table("Genes_per_sample_featurecounts.tabular", header=TRUE, row.names=1)
head(counts)
## SRR1554534 SRR1554535 SRR1554539 SRR1554537 SRR1554538 SRR1554541
## NM_000014.5 0 0 0 0 0 0
## NM_000015.3 6 4 4 2 6 4
## NM_000016.5 0 0 0 0 0 0
## NM_000017.4 44 29 27 16 32 16
## NM_000018.4 0 0 0 0 0 0
## NM_000019.4 1465 2990 2323 4505 5209 8137
counts_filtered = counts[rowSums(counts) > 0,]
colSums(counts_filtered)
## SRR1554534 SRR1554535 SRR1554539 SRR1554537 SRR1554538 SRR1554541
## 12808691 16532499 16128222 26936762 31970026 33466916
##Make plots of the raw data by sample
library(graphics)
plot(counts[,1], ylab=colnames(counts)[1])
plot(counts[,2], ylab=colnames(counts)[2])
plot(counts[,3], ylab=colnames(counts)[3])
plot(counts[,4], ylab=colnames(counts)[4])
plot(counts[,5], ylab=colnames(counts)[5])
plot(counts[,6], ylab=colnames(counts)[6])
plot(counts_filtered[,1], ylab=colnames(counts_filtered)[1])
plot(counts_filtered[,2], ylab=colnames(counts_filtered)[2])
plot(counts_filtered[,3], ylab=colnames(counts_filtered)[3])
plot(counts_filtered[,4], ylab=colnames(counts_filtered)[4])
plot(counts_filtered[,5], ylab=colnames(counts_filtered)[5])
plot(counts_filtered[,6], ylab=colnames(counts_filtered)[6])
##Make plots of the log2 transformed data
plot(log2(counts_filtered[,1]+1), ylab=colnames(counts_filtered)[1])
plot(log2(counts_filtered[,2]+1), ylab=colnames(counts_filtered)[2])
plot(log2(counts_filtered[,3]+1), ylab=colnames(counts_filtered)[3])
plot(log2(counts_filtered[,4]+1), ylab=colnames(counts_filtered)[4])
plot(log2(counts_filtered[,5]+1), ylab=colnames(counts_filtered)[5])
plot(log2(counts_filtered[,6]+1), ylab=colnames(counts_filtered)[6])
##Make boxplots of the counts filtered and log2 transformed
par(mar=c(7,5,1,1))
boxplot(counts_filtered, las=2, main="filtered data")
boxplot(log2(counts_filtered+1), las=2, main="log2 transformed data")
##Read phenotype data downloaded from NCBI and add a column for age group
phenotype = read.table("phenotype_data_capstone_genomics_coursera.txt", header=TRUE, dec=".")
phenotype["age_group"]=as.factor(c("adult","adult", "adult", "fetal", "fetal", "fetal"))
##Annotate gene names
library(annotate)
## Loading required package: AnnotationDbi
## Loading required package: stats4
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind, colnames,
## dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
## union, unique, unsplit, which.max, which.min
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
## Loading required package: IRanges
## Loading required package: S4Vectors
##
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:base':
##
## expand.grid
## Loading required package: XML
library(org.Hs.eg.db)
##
org.Hs.eg()
## Quality control information for org.Hs.eg:
##
##
## This package has the following mappings:
##
## org.Hs.egACCNUM has 40109 mapped keys (of 61547 keys)
## org.Hs.egACCNUM2EG has 794946 mapped keys (of 794946 keys)
## org.Hs.egALIAS2EG has 125573 mapped keys (of 125573 keys)
## org.Hs.egCHR has 61403 mapped keys (of 61547 keys)
## org.Hs.egCHRLENGTHS has 595 mapped keys (of 595 keys)
## org.Hs.egCHRLOC has 28306 mapped keys (of 61547 keys)
## org.Hs.egCHRLOCEND has 28306 mapped keys (of 61547 keys)
## org.Hs.egENSEMBL has 35228 mapped keys (of 61547 keys)
## org.Hs.egENSEMBL2EG has 38282 mapped keys (of 38282 keys)
## org.Hs.egENSEMBLPROT has 7004 mapped keys (of 61547 keys)
## org.Hs.egENSEMBLPROT2EG has 21951 mapped keys (of 21951 keys)
## org.Hs.egENSEMBLTRANS has 13080 mapped keys (of 61547 keys)
## org.Hs.egENSEMBLTRANS2EG has 39166 mapped keys (of 39166 keys)
## org.Hs.egENZYME has 2229 mapped keys (of 61547 keys)
## org.Hs.egENZYME2EG has 975 mapped keys (of 975 keys)
## org.Hs.egGENENAME has 61547 mapped keys (of 61547 keys)
## org.Hs.egGO has 20692 mapped keys (of 61547 keys)
## org.Hs.egGO2ALLEGS has 22780 mapped keys (of 22780 keys)
## org.Hs.egGO2EG has 18348 mapped keys (of 18348 keys)
## org.Hs.egMAP has 59023 mapped keys (of 61547 keys)
## org.Hs.egMAP2EG has 2002 mapped keys (of 2002 keys)
## org.Hs.egOMIM has 16208 mapped keys (of 61547 keys)
## org.Hs.egOMIM2EG has 21951 mapped keys (of 21951 keys)
## org.Hs.egPATH has 5868 mapped keys (of 61547 keys)
## org.Hs.egPATH2EG has 229 mapped keys (of 229 keys)
## org.Hs.egPMID has 38673 mapped keys (of 61547 keys)
## org.Hs.egPMID2EG has 667264 mapped keys (of 667264 keys)
## org.Hs.egREFSEQ has 38808 mapped keys (of 61547 keys)
## org.Hs.egREFSEQ2EG has 281994 mapped keys (of 281994 keys)
## org.Hs.egSYMBOL has 61547 mapped keys (of 61547 keys)
## org.Hs.egSYMBOL2EG has 61538 mapped keys (of 61538 keys)
## org.Hs.egUCSCKG has 27348 mapped keys (of 61547 keys)
## org.Hs.egUNIGENE has 26001 mapped keys (of 61547 keys)
## org.Hs.egUNIGENE2EG has 29207 mapped keys (of 29207 keys)
## org.Hs.egUNIPROT has 19082 mapped keys (of 61547 keys)
##
##
## Additional Information about this package:
##
## DB schema: HUMAN_DB
## DB schema version: 2.1
## Organism: Homo sapiens
## Date for NCBI data: 2020-Sep23
## Date for GO data: 2020-09-10
## Date for KEGG data: 2011-Mar15
## Date for Golden Path data: 2020-Aug27
## Date for Ensembl data: 2020-Aug18
select(org.Hs.eg.db, keys="NM_001001578", columns = "SYMBOL", keytype = "ACCNUM")
## 'select()' returned 1:1 mapping between keys and columns
## ACCNUM SYMBOL
## 1 NM_001001578 PDE9A
names = substr(rownames(counts), 1, nchar(rownames(counts))-2)
rownames(counts) = names
gen_symbols = select(org.Hs.eg.db, keys=names, columns = "SYMBOL", keytype = "ACCNUM")
## 'select()' returned 1:1 mapping between keys and columns
counts["symbol"] = gen_symbols[,2]
##Further filter the data to include only genes with counts higher than 10
counts_filtered10 = counts[rowSums(counts[1:6]) > 10,]
##Generate expression counts by gene
counts_bygene = aggregate(counts_filtered10[,1:6], by=list(counts_filtered10[,"symbol"]), sum)
rownames(counts_bygene) = counts_bygene[,1]
counts_bygene$Group.1 <- NULL
head(counts_bygene)
## SRR1554534 SRR1554535 SRR1554539 SRR1554537 SRR1554538 SRR1554541
## A1BG 246 248 185 445 248 464
## A1BG-AS1 96 54 65 101 47 122
## A2ML1 43 26 17 19 22 9
## A2MP1 1 11 14 12 13 13
## A3GALT2 6 5 2 13 28 8
## A4GNT 1 4 6 0 9 19
##Generate boxplots of counts by gene
par(mar=c(7,5,1,1))
boxplot(counts_bygene, las=2, ylab = "counts")
##Log2 transform the counts by sample
l2_counts = log2(counts_bygene+1)
##Generate a Summarized Experiment object
library(SummarizedExperiment)
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
##
## Attaching package: 'matrixStats'
## The following objects are masked from 'package:Biobase':
##
## anyMissing, rowMedians
##
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
##
## colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
## colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
## colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
## colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
## colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
## colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
## colWeightedMeans, colWeightedMedians, colWeightedSds,
## colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
## rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
## rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
## rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
## rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
## rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
## rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
## rowWeightedSds, rowWeightedVars
## The following object is masked from 'package:Biobase':
##
## rowMedians
## Loading required package: GenomicRanges
## Loading required package: GenomeInfoDb
## Warning: package 'GenomeInfoDb' was built under R version 4.0.4
phenotype[,4] = as.factor(phenotype[,4])
Cdata = SummarizedExperiment(assays=counts_bygene, colData=phenotype)
##Generate objects for the DSeq2 package analysis using transformed and untransformed data Generate model with all factors for transformed and untransformed data, and a simple model including only age group for the transformed data
library(DESeq2)
dseq = DESeqDataSetFromMatrix(assay(Cdata), colData(Cdata), ~ age_group+sex+RIN)
## the design formula contains one or more numeric variables that have mean or
## standard deviation larger than 5 (an arbitrary threshold to trigger this message).
## it is generally a good idea to center and scale numeric variables in the design
## to improve GLM convergence.
dseq_not_transformed = DESeqDataSetFromMatrix(assay(Cdata), colData(Cdata), ~ age_group+sex+RIN)
## the design formula contains one or more numeric variables that have mean or
## standard deviation larger than 5 (an arbitrary threshold to trigger this message).
## it is generally a good idea to center and scale numeric variables in the design
## to improve GLM convergence.
dseq_age = DESeqDataSetFromMatrix(assay(Cdata), colData(Cdata), ~ age_group)
##Normalize the data and perform PCA of DSeq2 object
dseq_norm = varianceStabilizingTransformation(dseq)
count_pca = prcomp(assay(dseq_norm), center=TRUE, scale=TRUE)
pca = princomp(assay(dseq_norm), cor=TRUE)
##Generate plots of the PCA using ggplot First, transform the count_pca object to a dataframe to use in ggplot, then generate the plots
library(ggplot2)
dat = data.frame(X=count_pca$rotation[,1], Y=count_pca$rotation[,2], age_group=phenotype$age_group, RIN=phenotype$RIN, SEX=phenotype$sex)
boxplot(log2(counts_bygene+1), las=2, ylab = "log2(counts+1)", cex.axis=0.8, names = c("adult -SRR1554534", "adult - SRR1554535", "adult - SRR1554539", "fetal - SRR1554537", "fetal - SRR1554538", "fetal - SRR1554541"))
boxplot(assay(dseq_norm), las=2, ylab = "transformed by variance stabilizing", cex.axis=0.8, names = c("adult -SRR1554534", "adult - SRR1554535", "adult - SRR1554539", "fetal - SRR1554537", "fetal - SRR1554538", "fetal - SRR1554541"))
plot(pca)
##Run DESeq2 analysis for transformed and untransformed data Run DESeq2 analysis for transformed and untransformed data with all variables as response, and with only age group as response for the transformed data, to compare the number of differentially expressed genes with a p-adjusted value lower than 0.05
dseq_nottransformed_results = DESeq(dseq_not_transformed)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
dseq_results = DESeq(dseq)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
dseq_age_results = DESeq(dseq_age)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
sum(results(dseq_nottransformed_results)$padj < 0.05, na.rm=TRUE)
## [1] 85
sum(results(dseq_results)$padj < 0.05, na.rm=TRUE)
## [1] 85
sum(results(dseq_age_results)$padj < 0.05, na.rm=TRUE)
## [1] 6894
##Extract and log transform data for limma analysis
edata = assay(Cdata)
head(edata)
## SRR1554534 SRR1554535 SRR1554539 SRR1554537 SRR1554538 SRR1554541
## A1BG 246 248 185 445 248 464
## A1BG-AS1 96 54 65 101 47 122
## A2ML1 43 26 17 19 22 9
## A2MP1 1 11 14 12 13 13
## A3GALT2 6 5 2 13 28 8
## A4GNT 1 4 6 0 9 19
edata_lg2 = log2(edata + 1)
head(edata_lg2)
## SRR1554534 SRR1554535 SRR1554539 SRR1554537 SRR1554538 SRR1554541
## A1BG 7.948367 7.960002 7.539159 8.800900 7.960002 8.861087
## A1BG-AS1 6.599913 5.781360 6.044394 6.672425 5.584963 6.942515
## A2ML1 5.459432 4.754888 4.169925 4.321928 4.523562 3.321928
## A2MP1 1.000000 3.584963 3.906891 3.700440 3.807355 3.807355
## A3GALT2 2.807355 2.584963 1.584963 3.807355 4.857981 3.169925
## A4GNT 1.000000 2.321928 2.807355 0.000000 3.321928 4.321928
dim(edata_lg2)
## [1] 17550 6
edata_lg2_filt = edata_lg2[rowMeans(edata_lg2) > 10, ]
dim(edata_lg2_filt)
## [1] 3199 6
##Run limma analysis
library(limma)
##
## Attaching package: 'limma'
## The following object is masked from 'package:DESeq2':
##
## plotMA
## The following object is masked from 'package:BiocGenerics':
##
## plotMA
library(edgeR)
model = model.matrix(~ Cdata$age_group)
fit_limma = lmFit(edata_lg2_filt, model)
ebayes = eBayes(fit_limma)
ebayes
## An object of class "MArrayLM"
## $coefficients
## (Intercept) Cdata$age_groupfetal
## AARS1 14.23026 -0.5186347
## AARS2 10.39848 1.1021997
## AASDHPPT 12.49400 1.5562765
## AASS 10.01977 1.1593046
## AATF 10.28662 1.1379707
## 3194 more rows ...
##
## $rank
## [1] 2
##
## $assign
## [1] 0 1
##
## $qr
## $qr
## (Intercept) Cdata$age_groupfetal
## 1 -2.4494897 -1.2247449
## 2 0.4082483 1.2247449
## 3 0.4082483 0.2898979
## 4 0.4082483 -0.5265986
## 5 0.4082483 -0.5265986
## 6 0.4082483 -0.5265986
## attr(,"assign")
## [1] 0 1
## attr(,"contrasts")
## attr(,"contrasts")$`Cdata$age_group`
## [1] "contr.treatment"
##
##
## $qraux
## [1] 1.408248 1.289898
##
## $pivot
## [1] 1 2
##
## $tol
## [1] 1e-07
##
## $rank
## [1] 2
##
##
## $df.residual
## [1] 4 4 4 4 4
## 3194 more elements ...
##
## $sigma
## AARS1 AARS2 AASDHPPT AASS AATF
## 0.1460217 0.1973929 0.7993312 0.8863305 0.2548009
## 3194 more elements ...
##
## $cov.coefficients
## (Intercept) Cdata$age_groupfetal
## (Intercept) 0.3333333 -0.3333333
## Cdata$age_groupfetal -0.3333333 0.6666667
##
## $stdev.unscaled
## (Intercept) Cdata$age_groupfetal
## AARS1 0.5773503 0.8164966
## AARS2 0.5773503 0.8164966
## AASDHPPT 0.5773503 0.8164966
## AASS 0.5773503 0.8164966
## AATF 0.5773503 0.8164966
## 3194 more rows ...
##
## $pivot
## [1] 1 2
##
## $Amean
## AARS1 AARS2 AASDHPPT AASS AATF
## 13.97094 10.94958 13.27213 10.59942 10.85561
## 3194 more elements ...
##
## $method
## [1] "ls"
##
## $design
## (Intercept) Cdata$age_groupfetal
## 1 1 0
## 2 1 0
## 3 1 0
## 4 1 1
## 5 1 1
## 6 1 1
## attr(,"assign")
## [1] 0 1
## attr(,"contrasts")
## attr(,"contrasts")$`Cdata$age_group`
## [1] "contr.treatment"
##
##
## $df.prior
## [1] 4.238586
##
## $s2.prior
## [1] 0.1275318
##
## $var.prior
## [1] 125.4589 121.1807
##
## $proportion
## [1] 0.01
##
## $s2.post
## AARS1 AARS2 AASDHPPT AASS AATF
## 0.07596496 0.08453032 0.37582612 0.44702837 0.09713421
## 3194 more elements ...
##
## $t
## (Intercept) Cdata$age_groupfetal
## AARS1 89.42659 -2.304626
## AARS2 61.94758 4.643010
## AASDHPPT 35.29950 3.109130
## AASS 25.95677 2.123614
## AATF 57.16725 4.471885
## 3194 more rows ...
##
## $df.total
## [1] 8.238586 8.238586 8.238586 8.238586 8.238586
## 3194 more elements ...
##
## $p.value
## (Intercept) Cdata$age_groupfetal
## AARS1 1.332123e-13 0.049201073
## AARS2 2.731016e-12 0.001535283
## AASDHPPT 2.763568e-10 0.013951233
## AASS 3.407894e-09 0.065462632
## AATF 5.285168e-12 0.001931586
## 3194 more rows ...
##
## $lods
## (Intercept) Cdata$age_groupfetal
## AARS1 18.33334 -4.917174
## AARS2 17.11436 -1.326569
## AASDHPPT 14.09566 -3.642994
## AASS 11.93187 -5.196401
## AATF 16.76922 -1.570223
## 3194 more rows ...
##
## $F
## [1] 7710.9646 4265.8205 1410.9320 756.2181 3649.6299
## 3194 more elements ...
##
## $F.p.value
## [1] 3.307805e-14 3.783242e-13 3.578336e-11 4.623020e-10 7.189030e-13
## 3194 more elements ...
limma_table = topTable(ebayes, number=length(rownames(edata_lg2_filt)))
## Removing intercept from test coefficients
head(limma_table)
## logFC AveExpr t P.Value adj.P.Val B
## SOX11 9.678600 12.27098 40.10630 9.706908e-11 3.105240e-07 13.83080
## DRAXIN 7.861109 11.45553 27.91827 1.881400e-09 3.009299e-06 11.93896
## SOX4 7.628643 12.74369 24.82583 4.898471e-09 4.587674e-06 11.20921
## GABRD -5.668479 10.00311 -24.34880 5.736385e-09 4.587674e-06 11.08372
## MEX3B 5.224093 11.39089 22.64248 1.035470e-08 5.720225e-06 10.60233
## BHLHE22 5.893903 11.60899 22.46396 1.104209e-08 5.720225e-06 10.54883
limma_table_output = limma_table[,c(1,4,5)]
head(limma_table_output)
## logFC P.Value adj.P.Val
## SOX11 9.678600 9.706908e-11 3.105240e-07
## DRAXIN 7.861109 1.881400e-09 3.009299e-06
## SOX4 7.628643 4.898471e-09 4.587674e-06
## GABRD -5.668479 5.736385e-09 4.587674e-06
## MEX3B 5.224093 1.035470e-08 5.720225e-06
## BHLHE22 5.893903 1.104209e-08 5.720225e-06
##Save tab delimited file of all differentially expressed genes
write.table(limma_table_output, file="genes_dif_expr.txt", sep="\t", row.names = TRUE, col.names = TRUE)
##Generates Volcano plot
par(mar=c(5,5,1,1))
with(limma_table, plot(logFC, -log10(adj.P.Val), pch=20, main="Volcano plot"))
with(subset(limma_table, adj.P.Val < 0.05), points(logFC, -log10(adj.P.Val), pch=20, col="red"))
##Filter differentially expressed genes by adjusted values of p lower than 0.05 Filter the differentially expressed genes by adjusted p values and generates objects for upregulated and downregulated genes
dif_expr = limma_table_output[limma_table_output$adj.P.Val < 0.05,]
downreg = limma_table_output[limma_table_output$adj.P.Val < 0.05 & limma_table_output$logFC > 1,]
upreg = limma_table_output[limma_table_output$adj.P.Val < 0.05 & limma_table_output$logFC < -1,]
dim(dif_expr)
## [1] 2261 3
dim(downreg)
## [1] 1723 3
dim(upreg)
## [1] 205 3
##Generates boxplots of differentially expressed genes
edata_l2filt_difexp = edata_lg2_filt[rownames(dif_expr),]
edata_l2filt_downreg = edata_lg2_filt[rownames(downreg),]
edata_l2filt_upreg = edata_lg2_filt[rownames(upreg),]
par(mar=c(8,5,1,1))
boxplot(edata_l2filt_difexp, las=2, main="diferential expression by sample", ylab = "log2(counts+1)", cex.axis=0.8, names = c("adult -SRR1554534", "adult - SRR1554535", "adult - SRR1554539", "fetal - SRR1554537", "fetal - SRR1554538", "fetal - SRR1554541"))
boxplot(edata_l2filt_downreg, las=2, main="downregulated genes by sample", ylab = "log2(counts+1)", cex.axis=0.8, names = c("adult -SRR1554534", "adult - SRR1554535", "adult - SRR1554539", "fetal - SRR1554537", "fetal - SRR1554538", "fetal - SRR1554541"))
boxplot(edata_l2filt_upreg, las=2, main="upregulated genes by sample", ylab = "log2(counts+1)", cex.axis=0.8, names = c("adult -SRR1554534", "adult - SRR1554535", "adult - SRR1554539", "fetal - SRR1554537", "fetal - SRR1554538", "fetal - SRR1554541"))
##Generate dispersion plot of the total number of up and downregulated genes by sample
library(ggrepel)
up_down = data.frame(colSums(edata_l2filt_upreg), colSums(edata_l2filt_downreg))
colnames(up_down) = c("upregulated genes [log2(counts+1)]", "downregulated genes [log2(counts+1)]")
row.names(up_down) = c("adult - SRR1554534", "adult - SRR1554535", "adult - SRR1554539", "fetal - SRR1554537", "fetal - SRR1554538", "fetal - SRR1554541")
up_down_plot = ggplot(up_down, aes(x= `upregulated genes [log2(counts+1)]`, y=`downregulated genes [log2(counts+1)]`))+geom_point(size=5)
up_down_plot + geom_label_repel(aes(label=row.names(up_down)), size=3)
##Get methylation narrow peaks for H3K4me3 using AnnotationHub, to compare with the diffentially expressed genes
library(AnnotationHub)
## Loading required package: BiocFileCache
## Loading required package: dbplyr
##
## Attaching package: 'AnnotationHub'
## The following object is masked from 'package:Biobase':
##
## cache
ah <- AnnotationHub()
## snapshotDate(): 2020-10-27
ah <- subset(ah, species == "Homo sapiens")
ah_fetal <- query(ah, c("EpigenomeRoadMap", "H3K4me3"))
fetal_met <- ah_fetal[["AH30471"]]
## loading from cache
## require("rtracklayer")
##
## Attaching package: 'Biostrings'
## The following object is masked from 'package:base':
##
## strsplit
ah_adult <- query(ah, c("EpigenomeRoadMap", "H3K4me3"))
adult_met <- ah_adult[["AH30413"]]
## loading from cache
ah_liver <- query(ah, c("EpigenomeRoadMap", "H3K4me3"))
liver_met <- ah_liver[["AH30367"]]
## loading from cache
##Get gene Symbols from limma results and convert to Entrez ID
library(mygene)
## Loading required package: GenomicFeatures
## Warning: package 'GenomicFeatures' was built under R version 4.0.4
##
## Attaching package: 'mygene'
## The following object is masked from 'package:AnnotationHub':
##
## query
dif_exp_genes = row.names(limma_table[limma_table$adj.P.Val < 0.05,])
dif_exp_gene_ids = queryMany(dif_exp_genes, scopes = "symbol", fields = "entrezgene", species = "human" )
## Querying chunk 1
## Querying chunk 2
## Querying chunk 3
## Finished
## Pass returnall=TRUE to return lists of duplicate or missing query terms.
head(dif_exp_gene_ids)
## DataFrame with 6 rows and 5 columns
## query _id X_score entrezgene notfound
## <character> <character> <numeric> <character> <logical>
## 1 SOX11 6664 89.6143 6664 NA
## 2 DRAXIN 374946 86.8990 374946 NA
## 3 SOX4 6659 88.9639 6659 NA
## 4 GABRD 2563 86.0339 2563 NA
## 5 MEX3B 84206 86.5313 84206 NA
## 6 BHLHE22 27319 86.7396 27319 NA
dif_exp_gene_entrez = na.omit(dif_exp_gene_ids$entrezgene)
##Generate intervals of genes plus promoters for the differentially expressed genes
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
txdb_genes <- genes(txdb)
## 403 genes were dropped because they have exons located on both strands
## of the same reference sequence or on more than one reference sequence,
## so cannot be represented by a single genomic range.
## Use 'single.strand.genes.only=FALSE' to get all the genes in a
## GRangesList object, or use suppressMessages() to suppress this message.
dif_exp_promoters <- promoters(txdb_genes[dif_exp_gene_entrez %in% txdb_genes$gene_id])
dif_exp_prom = promoters(txdb_genes[txdb_genes$gene_id %in% dif_exp_gene_ids$entrezgene])
####Get the overlaping intervals between all genes/promoters and methylation marks
fetal_ov_promoters = subsetByOverlaps(fetal_met, dif_exp_promoters)
adult_ov_promoters = subsetByOverlaps(adult_met, dif_exp_promoters)
liver_ov_promoters = subsetByOverlaps(liver_met, dif_exp_promoters)
##Get the overlaping intervals between differentially expressed genes/promoters and methylation marks
adult_ov_prom = subsetByOverlaps(adult_met, dif_exp_prom)
fetal_ov_prom = subsetByOverlaps(fetal_met, dif_exp_prom)
liver_ov_prom = subsetByOverlaps(liver_met, dif_exp_prom)
##Generate Venn Diagrams and hypergeometric test between the overlaping intervals
library(ChIPpeakAnno)
##
## Attaching package: 'ChIPpeakAnno'
## The following object is masked from 'package:annotate':
##
## getGO
met_overlaps = findOverlapsOfPeaks(fetal_met, adult_met, liver_met, connectedPeaks = "merge")
## duplicated or NA names found.
## Rename all the names by numbers.
## duplicated or NA names found.
## Rename all the names by numbers.
## duplicated or NA names found.
## Rename all the names by numbers.
peaks_prom_overlaps = findOverlapsOfPeaks(fetal_ov_prom, adult_ov_prom, liver_ov_prom, connectedPeaks = "merge")
## duplicated or NA names found.
## Rename all the names by numbers.
## duplicated or NA names found.
## Rename all the names by numbers.
## duplicated or NA names found.
## Rename all the names by numbers.
vennDiag_all_met <- makeVennDiagram(met_overlaps)
## Missing totalTest! totalTest is required for HyperG test.
## If totalTest is missing, pvalue will be calculated by estimating
## the total binding sites of encoding region of human.
## totalTest = humanGenomeSize * (2%(codingDNA) +
## 1%(regulationRegion)) / ( 2 * averagePeakWidth )
## = 3.3e+9 * 0.03 / ( 2 * averagePeakWidth)
## = 5e+7 /averagePeakWidth
HGtest <- makeVennDiagram(peaks_prom_overlaps, totalTest = 3500)
HGtest$p.value
## fetal_ov_prom adult_ov_prom liver_ov_prom pval
## [1,] 0 1 1 6.049833e-117
## [2,] 1 0 1 4.562069e-166
## [3,] 1 1 0 2.923927e-127
HGtest$vennCounts
## fetal_ov_prom adult_ov_prom liver_ov_prom Counts
## [1,] 0 0 0 338
## [2,] 0 0 1 163
## [3,] 0 1 0 459
## [4,] 0 1 1 733
## [5,] 1 0 0 37
## [6,] 1 0 1 2
## [7,] 1 1 0 98
## [8,] 1 1 1 1670
## attr(,"class")
## [1] "VennCounts"
##Turn Venn counts into a matrix
m = matrix(nrow=7, ncol=3)
m[1,]=c(0, 0, 238)
m[2,]=c(0, 442, 0)
m[3,]=c(0, 718, 718)
m[4,]=c(37, 0, 0)
m[5,]=c(2, 0, 2)
m[6,]=c(96, 96, 0)
m[7,]=c(1619, 1619, 1619)
##Perform a Pearson’s Chi-squared test
chisq.test(m, rescale.p = TRUE, simulate.p.value = TRUE)
##
## Pearson's Chi-squared test with simulated p-value (based on 2000
## replicates)
##
## data: m
## X-squared = 2026.3, df = NA, p-value = 0.0004998
sessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS High Sierra 10.13.6
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRblas.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib
##
## locale:
## [1] pt_BR.UTF-8/pt_BR.UTF-8/pt_BR.UTF-8/C/pt_BR.UTF-8/pt_BR.UTF-8
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] ChIPpeakAnno_3.24.1
## [2] TxDb.Hsapiens.UCSC.hg19.knownGene_3.2.2
## [3] mygene_1.26.0
## [4] GenomicFeatures_1.42.2
## [5] BSgenome.Hsapiens.UCSC.hg19_1.4.3
## [6] BSgenome_1.58.0
## [7] Biostrings_2.58.0
## [8] XVector_0.30.0
## [9] rtracklayer_1.50.0
## [10] AnnotationHub_2.22.0
## [11] BiocFileCache_1.14.0
## [12] dbplyr_2.1.0
## [13] ggrepel_0.9.1
## [14] edgeR_3.32.1
## [15] limma_3.46.0
## [16] ggplot2_3.3.3
## [17] DESeq2_1.30.1
## [18] SummarizedExperiment_1.20.0
## [19] GenomicRanges_1.42.0
## [20] GenomeInfoDb_1.26.4
## [21] MatrixGenerics_1.2.1
## [22] matrixStats_0.58.0
## [23] org.Hs.eg.db_3.12.0
## [24] annotate_1.68.0
## [25] XML_3.99-0.6
## [26] AnnotationDbi_1.52.0
## [27] IRanges_2.24.1
## [28] S4Vectors_0.28.1
## [29] Biobase_2.50.0
## [30] BiocGenerics_0.36.0
##
## loaded via a namespace (and not attached):
## [1] backports_1.2.1 Hmisc_4.5-0
## [3] plyr_1.8.6 lazyeval_0.2.2
## [5] splines_4.0.3 BiocParallel_1.24.1
## [7] digest_0.6.27 ensembldb_2.14.0
## [9] htmltools_0.5.1.1 fansi_0.4.2
## [11] magrittr_2.0.1 checkmate_2.0.0
## [13] memoise_2.0.0 cluster_2.1.1
## [15] askpass_1.1 prettyunits_1.1.1
## [17] jpeg_0.1-8.1 colorspace_2.0-0
## [19] blob_1.2.1 rappdirs_0.3.3
## [21] xfun_0.22 dplyr_1.0.5
## [23] crayon_1.4.1 RCurl_1.98-1.3
## [25] jsonlite_1.7.2 graph_1.68.0
## [27] genefilter_1.72.1 survival_3.2-10
## [29] glue_1.4.2 gtable_0.3.0
## [31] zlibbioc_1.36.0 DelayedArray_0.16.2
## [33] scales_1.1.1 futile.options_1.0.1
## [35] DBI_1.1.1 Rcpp_1.0.6
## [37] xtable_1.8-4 progress_1.2.2
## [39] htmlTable_2.1.0 foreign_0.8-81
## [41] bit_4.0.4 Formula_1.2-4
## [43] sqldf_0.4-11 htmlwidgets_1.5.3
## [45] httr_1.4.2 RColorBrewer_1.1-2
## [47] ellipsis_0.3.1 pkgconfig_2.0.3
## [49] farver_2.1.0 nnet_7.3-15
## [51] sass_0.3.1 locfit_1.5-9.4
## [53] utf8_1.2.1 tidyselect_1.1.0
## [55] labeling_0.4.2 rlang_0.4.10
## [57] later_1.1.0.1 munsell_0.5.0
## [59] BiocVersion_3.12.0 tools_4.0.3
## [61] cachem_1.0.4 gsubfn_0.7
## [63] generics_0.1.0 RSQLite_2.2.4
## [65] evaluate_0.14 stringr_1.4.0
## [67] fastmap_1.1.0 yaml_2.2.1
## [69] knitr_1.31 bit64_4.0.5
## [71] purrr_0.3.4 AnnotationFilter_1.14.0
## [73] KEGGREST_1.30.1 RBGL_1.66.0
## [75] mime_0.10 formatR_1.8
## [77] xml2_1.3.2 biomaRt_2.46.3
## [79] compiler_4.0.3 rstudioapi_0.13
## [81] curl_4.3 png_0.1-7
## [83] interactiveDisplayBase_1.28.0 tibble_3.1.0
## [85] geneplotter_1.68.0 bslib_0.2.4
## [87] stringi_1.5.3 futile.logger_1.4.3
## [89] highr_0.8 lattice_0.20-41
## [91] ProtGenerics_1.22.0 Matrix_1.3-2
## [93] multtest_2.46.0 vctrs_0.3.6
## [95] pillar_1.5.1 lifecycle_1.0.0
## [97] BiocManager_1.30.10 jquerylib_0.1.3
## [99] data.table_1.14.0 bitops_1.0-6
## [101] httpuv_1.5.5 R6_2.5.0
## [103] latticeExtra_0.6-29 promises_1.2.0.1
## [105] gridExtra_2.3 lambda.r_1.2.4
## [107] MASS_7.3-53.1 assertthat_0.2.1
## [109] chron_2.3-56 proto_1.0.0
## [111] openssl_1.4.3 withr_2.4.1
## [113] regioneR_1.22.0 GenomicAlignments_1.26.0
## [115] Rsamtools_2.6.0 GenomeInfoDbData_1.2.4
## [117] hms_1.0.0 VennDiagram_1.6.20
## [119] grid_4.0.3 rpart_4.1-15
## [121] rmarkdown_2.7 shiny_1.6.0
## [123] base64enc_0.1-3